Runtime Prediction via QC

About

This notebook is designed to 1) create a dataframe that brings together runtime information with the information provided by the QC group. It then 2) calculates the correlation of QC variables to runtime for a core variant calling workflows. Based on those correlations, it 3) generates a linear regression model to predict runtime from QC variables and then 4) evaluates the performance of this model on ~20% of the dataset that was excluded from the training.



In [1]:

    
# imports
import pandas as pd
import matplotlib.pyplot as plt

# this allows plots to appear directly in the notebook
%matplotlib inline



In [2]:

    
import math
# read a dict for runtimes
f = open("broad_timing.tsv", "r")
runtimes = {}
for line in iter(f):
    a = line.split("\t")
    runtimes[a[0]] = a[1]
f.close()

# read data into a DataFrame
data = pd.read_csv('PCAWG-QC_Summary-of-Measures_QC_Measures.tsv', delimiter='\t')
data['runtime'] = 0.0
# loop over and annotate with runtime
for index, row in data.iterrows():
    key = row['Project_code'] + "::" + row['Submitter_donor_ID']
    try:
        curr_runtime = math.log(float(runtimes[key]))
        #curr_runtime = float(runtimes[key])
        data.set_value(index, 'runtime', curr_runtime)
    except:
        continue
        
data.head()









    Out[2]:






  
    
      
      Project_code
      Submitter_donor_ID
      Normal_WGS_aliquot_ID
      Tumour_WGS_aliquot_ID
      Stars
      Mean_Coverage_Normal
      Mean_Coverage_Tumour
      Median/Mean_Coverage_Normal
      FWHM_Normal
      Median/Mean_Coverage_Tumour
      FWHM_Tumour
      Somatic_Mutation_Calling_Coverage
      %_of_paired_reads_mapping_to_different_chromosomes_Normal
      %_of_paired_reads_mapping_to_different_chromosomes_Tumour
      Ratio_of_difference_in_edits_between_paired_reads_Normal
      Ratio_of_difference_in_edits_between_paired_reads_Tumour
      runtime
    
  
  
    
      0
      BLCA-US
      096b4f32-10c1-4737-a0dd-cae04c54ee33
      e0fccaf5-925a-41f9-b87c-cd5ee4aecb59
      301d6ce3-4099-4c1d-8e50-c04b7ce91450
      5.0
      37.98
      45.40
      1.026162
      0.088868
      1.038914
      0.152645
      2.802268e+09
      2.14
      2.32
      1.177728
      1.200458
      4.371731
    
    
      1
      BLCA-US
      178b28cd-99c3-48dc-8d09-1ef71b4cee80
      c1da8eed-4919-4ba5-a735-3fba476c18a7
      4838b5a9-968c-4178-bffb-3fafe1f6dc09
      5.0
      31.50
      33.65
      1.015003
      0.114352
      0.981037
      0.097032
      2.746502e+09
      2.41
      1.80
      1.219270
      1.093826
      3.998751
    
    
      2
      BLCA-US
      1e308b12-0590-4dae-94d0-a539fcf25df7
      d6e91f4c-38c0-4393-9cb7-be076663dff3
      c66c92d5-df65-46e6-861d-d8a98808e6a3
      5.0
      32.72
      30.71
      1.021017
      0.095867
      0.968090
      0.085069
      2.733112e+09
      1.97
      2.03
      1.178552
      1.159640
      4.016203
    
    
      3
      BLCA-US
      24f21425-b001-4986-aedf-5b4dd851c6ad
      f88b2f4c-15f1-4808-bcec-89abc7466de6
      973d0577-8ca4-44a1-817f-1d3c1bada151
      4.5
      37.60
      39.17
      1.012372
      0.093099
      0.893826
      0.070527
      2.773264e+09
      2.93
      2.12
      1.188315
      1.198255
      4.248801
    
    
      4
      BLCA-US
      3ed614e7-f356-4d87-985b-d3bbbae3bb40
      c14e29aa-a979-4452-8019-7eebfb3d5d04
      91f458e6-64b7-454d-a542-b0aa23638fd8
      5.0
      31.90
      33.55
      1.011833
      0.083130
      0.931186
      0.081127
      2.633835e+09
      2.05
      1.95
      1.003750
      1.063998
      4.080832



In [3]:

    
# now I have a merged dataframe that has all QC values along with the runtime for the workflow
print(data.shape)



In [4]:

    
# Import matplotlib
import matplotlib.pyplot as plt

# Make a histogram of all the ratings in the average_rating column.
plt.hist(data["runtime"])

# Show the plot.
plt.show()



In [5]:

    
# remove 0 runtimes
data = data[data["runtime"] > 0.0]
plt.hist(data["runtime"])
plt.show()



In [6]:

    
# remove any NAN
data = data.dropna() 
data.isnull().values.any()









    Out[6]:





False



In [7]:

    
# showing zeros and nan have been removed
print(data.shape)



In [8]:

    
# general stats
data.describe()









    Out[8]:






  
    
      
      Stars
      Mean_Coverage_Normal
      Mean_Coverage_Tumour
      Median/Mean_Coverage_Normal
      FWHM_Normal
      Median/Mean_Coverage_Tumour
      FWHM_Tumour
      Somatic_Mutation_Calling_Coverage
      %_of_paired_reads_mapping_to_different_chromosomes_Normal
      %_of_paired_reads_mapping_to_different_chromosomes_Tumour
      Ratio_of_difference_in_edits_between_paired_reads_Normal
      Ratio_of_difference_in_edits_between_paired_reads_Tumour
      runtime
    
  
  
    
      count
      2113.000000
      2113.000000
      2113.000000
      2113.000000
      2113.000000
      2113.000000
      2113.000000
      2.113000e+03
      2113.000000
      2113.000000
      2113.000000
      2113.000000
      2113.000000
    
    
      mean
      4.683152
      39.157208
      52.017123
      1.019204
      0.106341
      0.999745
      0.113314
      2.770680e+09
      1.866072
      1.747421
      1.359632
      1.366029
      4.503757
    
    
      std
      0.569348
      9.716525
      15.818012
      0.026829
      0.062765
      0.041127
      0.066906
      1.098822e+08
      2.294404
      1.289708
      0.338257
      0.308380
      0.370143
    
    
      min
      1.000000
      19.410000
      7.450000
      0.560367
      0.046188
      0.773088
      0.046799
      1.546442e+08
      0.290000
      0.300000
      1.000506
      1.000963
      3.377180
    
    
      25%
      4.500000
      33.700000
      38.500000
      1.012596
      0.076133
      0.981827
      0.076157
      2.762527e+09
      0.990000
      1.000000
      1.180890
      1.174684
      4.259262
    
    
      50%
      5.000000
      37.290000
      50.990000
      1.023925
      0.087289
      1.010183
      0.093383
      2.800246e+09
      1.410000
      1.420000
      1.278489
      1.284797
      4.474562
    
    
      75%
      5.000000
      41.950000
      63.390000
      1.029809
      0.113010
      1.026581
      0.127158
      2.816524e+09
      2.070000
      2.030000
      1.437030
      1.456764
      4.700574
    
    
      max
      5.000000
      177.890000
      218.520000
      1.085272
      0.770586
      1.131986
      0.860889
      2.846089e+09
      57.900000
      16.570000
      5.083394
      3.829608
      6.788969



In [9]:

    
# look at correlation
data.corr()["runtime"]









    Out[9]:





Stars                                                       -0.028838
Mean_Coverage_Normal                                         0.242747
Mean_Coverage_Tumour                                         0.568345
Median/Mean_Coverage_Normal                                 -0.038067
FWHM_Normal                                                  0.010407
Median/Mean_Coverage_Tumour                                 -0.063889
FWHM_Tumour                                                  0.026909
Somatic_Mutation_Calling_Coverage                            0.232295
%_of_paired_reads_mapping_to_different_chromosomes_Normal    0.031297
%_of_paired_reads_mapping_to_different_chromosomes_Tumour    0.121966
Ratio_of_difference_in_edits_between_paired_reads_Normal     0.061750
Ratio_of_difference_in_edits_between_paired_reads_Tumour     0.089012
runtime                                                      1.000000
Name: runtime, dtype: float64



In [10]:

    
# visualize the relationship between the features and the response using scatterplots
import matplotlib.pyplot as plt
fig, axs = plt.subplots(4, 3, sharey=True)
fig.subplots_adjust(hspace=.5, wspace=.5)
data.plot(kind='scatter', x='Stars', y='runtime', ax=axs[0, 0], figsize=(16, 16))
data.plot(kind='scatter', x='Mean_Coverage_Normal', y='runtime', ax=axs[0, 1])
data.plot(kind='scatter', x='Mean_Coverage_Tumour', y='runtime', ax=axs[0, 2])
data.plot(kind='scatter', x='FWHM_Normal', y='runtime', ax=axs[1, 0])
data.plot(kind='scatter', x='Median/Mean_Coverage_Tumour', y='runtime', ax=axs[1, 1])
data.plot(kind='scatter', x='FWHM_Tumour', y='runtime', ax=axs[1, 2])
data.plot(kind='scatter', x='Somatic_Mutation_Calling_Coverage', y='runtime', ax=axs[2, 0])
data.plot(kind='scatter', x='%_of_paired_reads_mapping_to_different_chromosomes_Normal', y='runtime', ax=axs[2, 1])
data.plot(kind='scatter', x='%_of_paired_reads_mapping_to_different_chromosomes_Tumour', y='runtime', ax=axs[2, 2])
data.plot(kind='scatter', x='Ratio_of_difference_in_edits_between_paired_reads_Normal', y='runtime', ax=axs[3, 0])
data.plot(kind='scatter', x='Ratio_of_difference_in_edits_between_paired_reads_Tumour', y='runtime', ax=axs[3, 1])
data.plot(kind='scatter', x='runtime', y='runtime', ax=axs[3, 2])









    Out[10]:





<matplotlib.axes._subplots.AxesSubplot at 0x118827eb8>



In [11]:

    
# now clear out the columns that we don't want to use
# Get all the columns from the dataframe.
columns = data.columns.tolist()
# Filter the columns to remove ones we don't want.
columns = [c for c in columns if c not in ["runtime", "Project_code", "Submitter_donor_ID", "Normal_WGS_aliquot_ID", "Tumour_WGS_aliquot_ID"]]
#columns = [c for c in columns if c not in ["runtime", "Project_code", "Submitter_donor_ID", "Normal_WGS_aliquot_ID", "Tumour_WGS_aliquot_ID", "Median/Mean_Coverage_Normal", "FWHM_Normal", "Median/Mean_Coverage_Tumour", "FWHM_Tumour", "Somatic_Mutation_Calling_Coverage", "%_of_paired_reads_mapping_to_different_chromosomes_Normal", "Ratio_of_difference_in_edits_between_paired_reads_Normal", "Ratio_of_difference_in_edits_between_paired_reads_Tumour"]]

# Store the variable we'll be predicting on.
target = "runtime"



In [12]:

    
# Import a convenience function to split the sets.
from sklearn.model_selection import train_test_split

# Generate the training set.  Set random_state to be able to replicate results.
train = data.sample(frac=0.8, random_state=1)
# Select anything not in the training set and put it in the testing set.
test = data.loc[~data.index.isin(train.index)]
# Print the shapes of both sets.
print(train.shape)
print(test.shape)









    



(1690, 17)
(423, 17)



In [13]:

    
# Import the linearregression model.
from sklearn.linear_model import LinearRegression

# Initialize the model class.
model = LinearRegression()
train.head()
# Fit the model to the training data.
model.fit(train[columns], train[target])









    Out[13]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [14]:

    
# now test 
# Import the scikit-learn function to compute error.
from sklearn.metrics import mean_squared_error

# Generate our predictions for the test set.
predictions = model.predict(test[columns])

# Compute error between our test predictions and the actual values.
mean_squared_error(predictions, test[target])









    Out[14]:





0.081986504095072368



In [15]:

    
# try random forest
# Import the random forest model.
from sklearn.ensemble import RandomForestRegressor

# Initialize the model with some parameters.
model = RandomForestRegressor(n_estimators=100, min_samples_leaf=10, random_state=1)
# Fit the model to the data.
model.fit(train[columns], train[target])
# Make predictions.
predictions = model.predict(test[columns])
# Compute the error.
mean_squared_error(predictions, test[target])









    Out[15]:





0.081098434284504028



In [18]:

    
# look at predicted vs. actual
import statsmodels.api as sm
import numpy as np


data['predicted_runtime'] = model.predict(data[columns])
#data.plot(kind='scatter', x='runtime', y='predicted_runtime')

results = sm.OLS(data['predicted_runtime'],sm.add_constant(data['runtime'])).fit()

print(results.summary())

plt.scatter(data['runtime'],data['predicted_runtime'])

#X_plot = np.linspace(0,1,100)
#plt.plot(X_plot, X_plot*results.params[0] + results.params[1])

# fit with np.polyfit
m, b = np.polyfit(data['runtime'], data['predicted_runtime'], 1)

#plt.plot(x, y, '.')
plt.plot(data['runtime'], m*data['runtime'] + b, '-', color='r')
plt.ylabel('predicted runtime (log)')
plt.xlabel('runtime (log)')
plt.show()









    



                            OLS Regression Results                            
==============================================================================
Dep. Variable:      predicted_runtime   R-squared:                       0.559
Model:                            OLS   Adj. R-squared:                  0.559
Method:                 Least Squares   F-statistic:                     2681.
Date:                Sun, 05 Feb 2017   Prob (F-statistic):               0.00
Time:                        22:46:17   Log-Likelihood:                 874.21
No. Observations:                2113   AIC:                            -1744.
Df Residuals:                    2111   BIC:                            -1733.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [95.0% Conf. Int.]
------------------------------------------------------------------------------
const          2.3089      0.043     54.300      0.000         2.226     2.392
runtime        0.4872      0.009     51.775      0.000         0.469     0.506
==============================================================================
Omnibus:                      174.458   Durbin-Watson:                   1.233
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              586.212
Skew:                          -0.379   Prob(JB):                    5.08e-128
Kurtosis:                       5.466   Cond. No.                         57.9
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

Summary

Seems like the coverage for tumor dominates the runtime. The model agrees with actual runtime not particularly well, R^2 of 0.559



In [ ]:

	Project_code	Submitter_donor_ID	Normal_WGS_aliquot_ID	Tumour_WGS_aliquot_ID	Stars	Mean_Coverage_Normal	Mean_Coverage_Tumour	Median/Mean_Coverage_Normal	FWHM_Normal	Median/Mean_Coverage_Tumour	FWHM_Tumour	Somatic_Mutation_Calling_Coverage	%_of_paired_reads_mapping_to_different_chromosomes_Normal	%_of_paired_reads_mapping_to_different_chromosomes_Tumour	Ratio_of_difference_in_edits_between_paired_reads_Normal	Ratio_of_difference_in_edits_between_paired_reads_Tumour	runtime
0	BLCA-US	096b4f32-10c1-4737-a0dd-cae04c54ee33	e0fccaf5-925a-41f9-b87c-cd5ee4aecb59	301d6ce3-4099-4c1d-8e50-c04b7ce91450	5.0	37.98	45.40	1.026162	0.088868	1.038914	0.152645	2.802268e+09	2.14	2.32	1.177728	1.200458	4.371731
1	BLCA-US	178b28cd-99c3-48dc-8d09-1ef71b4cee80	c1da8eed-4919-4ba5-a735-3fba476c18a7	4838b5a9-968c-4178-bffb-3fafe1f6dc09	5.0	31.50	33.65	1.015003	0.114352	0.981037	0.097032	2.746502e+09	2.41	1.80	1.219270	1.093826	3.998751
2	BLCA-US	1e308b12-0590-4dae-94d0-a539fcf25df7	d6e91f4c-38c0-4393-9cb7-be076663dff3	c66c92d5-df65-46e6-861d-d8a98808e6a3	5.0	32.72	30.71	1.021017	0.095867	0.968090	0.085069	2.733112e+09	1.97	2.03	1.178552	1.159640	4.016203
3	BLCA-US	24f21425-b001-4986-aedf-5b4dd851c6ad	f88b2f4c-15f1-4808-bcec-89abc7466de6	973d0577-8ca4-44a1-817f-1d3c1bada151	4.5	37.60	39.17	1.012372	0.093099	0.893826	0.070527	2.773264e+09	2.93	2.12	1.188315	1.198255	4.248801
4	BLCA-US	3ed614e7-f356-4d87-985b-d3bbbae3bb40	c14e29aa-a979-4452-8019-7eebfb3d5d04	91f458e6-64b7-454d-a542-b0aa23638fd8	5.0	31.90	33.55	1.011833	0.083130	0.931186	0.081127	2.633835e+09	2.05	1.95	1.003750	1.063998	4.080832

	Stars	Mean_Coverage_Normal	Mean_Coverage_Tumour	Median/Mean_Coverage_Normal	FWHM_Normal	Median/Mean_Coverage_Tumour	FWHM_Tumour	Somatic_Mutation_Calling_Coverage	%_of_paired_reads_mapping_to_different_chromosomes_Normal	%_of_paired_reads_mapping_to_different_chromosomes_Tumour	Ratio_of_difference_in_edits_between_paired_reads_Normal	Ratio_of_difference_in_edits_between_paired_reads_Tumour	runtime
count	2113.000000	2113.000000	2113.000000	2113.000000	2113.000000	2113.000000	2113.000000	2.113000e+03	2113.000000	2113.000000	2113.000000	2113.000000	2113.000000
mean	4.683152	39.157208	52.017123	1.019204	0.106341	0.999745	0.113314	2.770680e+09	1.866072	1.747421	1.359632	1.366029	4.503757
std	0.569348	9.716525	15.818012	0.026829	0.062765	0.041127	0.066906	1.098822e+08	2.294404	1.289708	0.338257	0.308380	0.370143
min	1.000000	19.410000	7.450000	0.560367	0.046188	0.773088	0.046799	1.546442e+08	0.290000	0.300000	1.000506	1.000963	3.377180
25%	4.500000	33.700000	38.500000	1.012596	0.076133	0.981827	0.076157	2.762527e+09	0.990000	1.000000	1.180890	1.174684	4.259262
50%	5.000000	37.290000	50.990000	1.023925	0.087289	1.010183	0.093383	2.800246e+09	1.410000	1.420000	1.278489	1.284797	4.474562
75%	5.000000	41.950000	63.390000	1.029809	0.113010	1.026581	0.127158	2.816524e+09	2.070000	2.030000	1.437030	1.456764	4.700574
max	5.000000	177.890000	218.520000	1.085272	0.770586	1.131986	0.860889	2.846089e+09	57.900000	16.570000	5.083394	3.829608	6.788969